MegaMUGA Annotations

Reading in reference genotypes and metadata

First I read in the reference sample genotypes, as well as marker annotations from an analysis previously conducted by Karl Broman, Dan Gatti, and Belinda Cornes.

# Reading in "reference sample genotype data
control_genotypes <- suppressWarnings(data.table::fread("../data/MMControls/control.genotypes.csv"))
colnames(control_genotypes)[1] <- "marker"

## Reading in marker annotations
mm_metadata <- data.table::fread("../data/MMControls/mm_uwisc_v2.csv")

Marker QC: Searching for missing genotype calls

We searched for probes where many mice are missing genotype calls.

## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
  tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
  dplyr::group_by(marker, genotype) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(freq = round(n/sum(n), 3),
                genotype = as.factor(genotype)) %>%
  dplyr::left_join(., mm_metadata)

## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
  dplyr::filter(genotype == "N") %>%
  tidyr::pivot_wider(names_from = genotype, values_from = n) %>%
  dplyr::select(marker, chr, bp_grcm39, freq) %>%
  dplyr::mutate(chr = as.factor(chr))
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
  dplyr::filter(freq > cutoff)

Of 77808 markers, 54990 failed to genotype at least one sample, and 2750 markers failed to genotype at least 12.765% of samples.

Sample QC: Searching for samples with poor marker representation

In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains with identical names meant that genotype counts for each marker among them couldn’t be grouped and tallied, so determining no-call frequency occurred column-wise. Mouse over individual samples to see the number of markers with missing genotypes for each sample.

n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)], MARGIN = 2, function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
  dplyr::rename(n.no.calls = n.calls.strains)

sampleQC <- ggplot(n.calls.strains.df, 
                   mapping = aes(x = reorder(sample,n.no.calls), 
                                 y = n.no.calls,
                                 text = paste("Sample:", sample))) + 
  geom_point() +
  QCtheme + 
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  labs(x = "Number of mice with missing genotypes",
       y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))
high.n.samples <- n.calls.strains.df %>%
  dplyr::filter(n.no.calls > quantile(n.calls.strains.df$n.no.calls, probs = seq(0,1,0.05))[20])

Validating sex of reference samples

We next validated the sexes of each sample using sex chromosome probe intensities. We paired up probe intensities, joined available metadata, and filtered down to only markers covering the X and Y chromosomes.

## Reading in genotype intensities
x_intensities <- suppressWarnings(data.table::fread("../data/MMControls/control.X.csv"))
colnames(x_intensities)[1] <- "marker"
y_intensities <- suppressWarnings(data.table::fread("../data/MMControls/control.Y.csv"))
colnames(y_intensities)[1] <- "marker"
## Check to see if dimensions of intensity tables are identical, marker orders identical, and sample orders identical
if((unique(dim(x_intensities) == dim(y_intensities)) && 
   unique(colnames(x_intensities) == colnames(y_intensities)) &&
   unique(x_intensities$marker == y_intensities$marker)) == TRUE){
     ## Pivoting the data longer
  x_int_long <- x_intensities %>%
  tidyr::pivot_longer(cols = -marker, names_to = "sample", values_to = "x_int")
  y_int_long <- y_intensities %>%
  tidyr::pivot_longer(cols = -marker, names_to = "sample", values_to = "y_int")
  
  long_intensities <- cbind(x_int_long, y_int_long)
} else {
     print("Source intensity data frames have non-identical structure; exiting")
}

## Joining slimmer intensity files with marker metadata and reducing to Chromosome X markers
long_XY_intensities <- long_intensities[,c(1,2,3,6)] %>%
  dplyr::left_join(., mm_metadata) %>%
  dplyr::filter(chr %in% c("X","Y"))

Then we flagged markers with high missingness across all samples, as well as samples with high missingness among all markers.

## Flagging markers and samples based on previous QC steps
flagged_XY_intensities <- long_XY_intensities %>%
  dplyr::mutate(marker_flag = dplyr::if_else(condition = marker %in% above.cutoff$marker,
                                             true = "FLAG",
                                             false = "")) %>%
  dplyr::mutate(high_missing_sample = dplyr::if_else(condition = sample %in% high.n.samples$sample,
                                                     true = "FLAG",
                                                     false = ""))

The first round of inferring predicted sexes used a rough search of the sample name for expected nomenclature convention, which includes a sex denotation.

## First round of predicted sex inference
prelim.predicted.sexes <- flagged_XY_intensities %>%
  dplyr::mutate(bg = dplyr::case_when(stringr::str_detect(string = sample, 
                                                          pattern = "F1") == TRUE ~ "F1",
                                      TRUE ~ as.character(sample)),
                predicted.sex = dplyr::case_when(stringr::str_detect(string = sample, 
                                                          pattern = "F1f") == TRUE ~ "f",
                                      stringr::str_detect(string = sample, 
                                                          pattern = "F1m") == TRUE ~ "m",
                                      TRUE ~ "unknown"))

unknown <- prelim.predicted.sexes %>%
  dplyr::filter(predicted.sex == "unknown")

digit.trim <- unknown %>% 
  dplyr::mutate(mouse.id.1 = stringr::str_sub(sample, -1),
                predicted.sex.1 = dplyr::case_when(stringr::str_sub(mouse.id.1, 1, 1) %in% c("m","M") ~ "m",
                                                   stringr::str_sub(mouse.id.1, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.1 == "m" | predicted.sex.1 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.1, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.3= stringr::str_sub(sample, -3),
                predicted.sex.3 = dplyr::case_when(stringr::str_sub(mouse.id.3, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.3, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.3 == "m" | predicted.sex.3 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.3, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.4 = stringr::str_sub(sample, -4),
                predicted.sex.4 = dplyr::case_when(stringr::str_sub(mouse.id.4, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.4, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.4 == "m" | predicted.sex.4 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.4, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.5 = stringr::str_sub(sample, -5),
                mouse.id.5 = stringr::str_replace(string = mouse.id.5,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:symbol:]", 
                                                  replacement = ""),
                predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.5 == "m" | predicted.sex.5 == "f"),
                             true = str_replace(string = bg,
                                                pattern = mouse.id.5,
                                                replacement = ""),
                             false = bg),
                
                mouse.id.6 = stringr::str_sub(sample, -6),
                mouse.id.6 = stringr::str_replace(string = mouse.id.6,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:punct:]", 
                                                  replacement = ""),
                mouse.id.6 = stringr::str_replace(string = mouse.id.6,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:symbol:]", 
                                                  replacement = ""),
                predicted.sex.6 = dplyr::case_when(stringr::str_sub(mouse.id.6, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.6, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.6 == "m" | predicted.sex.6 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.6, 
                                                replacement = ""), 
                             false = bg)) %>%
  dplyr::mutate(predicted.sex = dplyr::case_when(predicted.sex.1 == "m" ~ "m", 
                                                 predicted.sex.3 == "m" ~ "m",
                                                 predicted.sex.4 == "m" ~ "m",
                                                 predicted.sex.5 == "m" ~ "m",
                                                 predicted.sex.6 == "m" ~ "m",
                                                 
                                                 predicted.sex.1 == "f" ~ "f", 
                                                 predicted.sex.3 == "f" ~ "f",
                                                 predicted.sex.4 == "f" ~ "f",
                                                 predicted.sex.5 == "f" ~ "f",
                                                 predicted.sex.6 == "f" ~ "f",
                                                 TRUE ~ "unknown"))

# Removing previously "unknown" samples from initial results and binding newly inferred samples
predicted.sexes.strings <- prelim.predicted.sexes %>%
  dplyr::filter(predicted.sex != "unknown") %>%
  dplyr::bind_rows(.,digit.trim)

## Taking the first marker as a sample and tabulating the number of samples for each predicted sex
predicted.sex.table <- predicted.sexes.strings %>%
  dplyr::filter(marker %in% unique(prelim.predicted.sexes$marker)[1]) %>%
  dplyr::select(sample, predicted.sex, bg) %>%
  dplyr::group_by(predicted.sex) %>%
  dplyr::count() 

This captured 94 female samples, 249 male samples, leaving 21 samples of unknown predicted sex from nomenclature alone. These samples (below) exhibit a range of naming irregularities.

predicted.sexes.strings %>%
  dplyr::filter(predicted.sex == "unknown",
                marker == predicted.sexes.strings$marker[[1]]) %>%
  dplyr::select(sample, bg)
##               sample                bg
## 1           34-HG-F1                F1
## 2           36-PG-F1                F1
## 3             BAG74u            BAG74u
## 4              BAG99             BAG99
## 5           CAST/EiJ          CAST/EiJ
## 6               IN13              IN13
## 7               IN25              IN25
## 8               IN34              IN34
## 9               IN40              IN40
## 10              IN47              IN47
## 11              IN54              IN54
## 12 KOMP cell DNA JM8 KOMP cell DNA JM8
## 13 KOMP cell DNA JM8 KOMP cell DNA JM8
## 14 KOMP cell DNA JM8 KOMP cell DNA JM8
## 15           MWN1026           MWN1026
## 16           MWN1030           MWN1030
## 17           MWN1194           MWN1194
## 18           MWN1198           MWN1198
## 19          MWN1214u          MWN1214u
## 20           MWN1279           MWN1279
## 21           MWN1287           MWN1287

After predicting the sexes of the vast majority of reference samples, we visualized the average probe intensity among X Chromosome markers for each sample, labeling them by predicted sex. Samples colored black were unabled to have their sex inferred by the sample name, but cluster well with mice for which sex could be inferred. Conversely, some samples’ predicted sex is discordant with X and Y Chromosome marker intensities (i.e. blue samples that cluster with mostly orange samples, and vice versa). Mouse over individual dots to view the sample, as well as whether it was flagged for having many markers with missing genotype information.

mean.x.intensities.by.sex <- predicted.sexes.strings %>%
  dplyr::filter(marker_flag != "FLAG") %>%
  dplyr::group_by(sample, predicted.sex, chr, high_missing_sample) %>%
  dplyr::summarise(mean.x = mean(x_int),
                   mean.y = mean(y_int))

predicted.sex.plot.palettes <- mean.x.intensities.by.sex %>%
  dplyr::ungroup() %>%
  dplyr::distinct(sample, predicted.sex, high_missing_sample) %>%
  dplyr::mutate(predicted.sex.palette = dplyr::case_when(predicted.sex == "f" ~ "#5856b7",
                                                         predicted.sex == "m" ~ "#eeb868",
                                                         predicted.sex == "unknown" ~ "black"))

predicted.sex.palette <- predicted.sex.plot.palettes$predicted.sex.palette
names(predicted.sex.palette) <- predicted.sex.plot.palettes$predicted.sex

mean.x.intensities.by.sex.plot <- ggplot(mean.x.intensities.by.sex, 
                                         mapping = aes(x = mean.x, 
                                                       y = mean.y, 
                                                       colour = predicted.sex,
                                                       shape = high_missing_sample,
                                                       text = sample,
                                                       label = high_missing_sample)) + 
  geom_point() + 
  scale_colour_manual(values = predicted.sex.palette) + 
  facet_grid(.~chr) +
  QCtheme
ggplotly(mean.x.intensities.by.sex.plot, 
         tooltip = c("text","label"),)